/*
 * $HeadURL: http://svn.apache.org/repos/asf/httpcomponents/httpcore/trunk/module-main/src/main/java/org/apache/http/message/BasicTokenIterator.java $
 * $Revision: 602520 $
 * $Date: 2007-12-08 09:42:26 -0800 (Sat, 08 Dec 2007) $
 *
 * ====================================================================
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 *
 */

package org.apache.http.message;

import java.util.NoSuchElementException;

import org.apache.http.HeaderIterator;
import org.apache.http.ParseException;
import org.apache.http.TokenIterator;

/**
 * Basic implementation of a {@link TokenIterator}. This implementation parses
 * <tt>#token</tt> sequences as
 * defined by RFC 2616, section 2.
 * It extends that definition somewhat beyond US-ASCII.
 * <p>
 * Tokens are read lazily from the values of the headers supplied by a
 * {@link HeaderIterator}. The end of a header value acts as a token
 * separator, so a token never spans two headers. Headers whose value is
 * <code>null</code> contain no tokens and are skipped.
 * </p>
 * 
 * @version $Revision: 602520 $
 */
public class BasicTokenIterator implements TokenIterator {

    /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
    // the order of the characters here is adjusted to put the
    // most likely candidates at the beginning of the collection
    public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";

    /** The iterator from which to obtain the next header. */
    protected final HeaderIterator headerIt;

    /**
     * The value of the current header. This is the header value that includes
     * {@link #currentToken}. Undefined if the iteration is over.
     */
    protected String currentHeader;

    /**
     * The token to be returned by the next call to {@link #nextToken}.
     * <code>null</code> if the iteration is over.
     */
    protected String currentToken;

    /**
     * The position after {@link #currentToken} in {@link #currentHeader}.
     * Undefined if the iteration is over.
     */
    protected int searchPos;

    /**
     * Creates a new instance of {@link BasicTokenIterator}. The first token,
     * if there is one, is located immediately.
     * <p>
     * Note that this constructor invokes the overridable method
     * {@link #findNext findNext}. Subclasses that override it, or any of the
     * methods it calls, must not rely on their own fields being initialized
     * at that point.
     * </p>
     * 
     * @param headerIterator
     *            the iterator for the headers to tokenize
     * 
     * @throws IllegalArgumentException
     *             if the header iterator is <code>null</code>
     * @throws ParseException
     *             if an invalid header value is encountered while looking for
     *             the first token
     */
    public BasicTokenIterator(final HeaderIterator headerIterator) {
        if (headerIterator == null) {
            throw new IllegalArgumentException(
                    "Header iterator must not be null.");
        }

        this.headerIt = headerIterator;
        this.searchPos = findNext(-1);
    }

    // non-javadoc, see interface TokenIterator
    public boolean hasNext() {
        return (this.currentToken != null);
    }

    /**
     * Obtains the next token from this iteration.
     * 
     * @return the next token in this iteration
     * 
     * @throws NoSuchElementException
     *             if the iteration is already over
     * @throws ParseException
     *             if an invalid header value is encountered
     */
    public String nextToken() throws NoSuchElementException, ParseException {

        if (this.currentToken == null) {
            throw new NoSuchElementException("Iteration already finished.");
        }

        final String result = this.currentToken;
        // updates currentToken, may trigger ParseException:
        this.searchPos = findNext(this.searchPos);

        return result;
    }

    /**
     * Returns the next token. Same as {@link #nextToken}, but with generic
     * return type.
     * 
     * @return the next token in this iteration
     * 
     * @throws NoSuchElementException
     *             if there are no more tokens
     * @throws ParseException
     *             if an invalid header value is encountered
     */
    public final Object next() throws NoSuchElementException, ParseException {
        return nextToken();
    }

    /**
     * Removing tokens is not supported.
     * 
     * @throws UnsupportedOperationException
     *             always
     */
    public final void remove() throws UnsupportedOperationException {

        throw new UnsupportedOperationException(
                "Removing tokens is not supported.");
    }

    /**
     * Determines the next token. If found, the token is stored in
     * {@link #currentToken}. The return value indicates the position after the
     * token in {@link #currentHeader}. If necessary, the next header will be
     * obtained from {@link #headerIt}. If not found, {@link #currentToken} is
     * set to <code>null</code>.
     * 
     * @param from
     *            the position in the current header at which to start the
     *            search, -1 to search in the first header
     * 
     * @return the position after the found token in the current header, or
     *         negative if there was no next token
     * 
     * @throws ParseException
     *             if an invalid header value is encountered
     */
    protected int findNext(int from) throws ParseException {

        if (from < 0) {
            // called from the constructor, initialize the first header
            this.currentHeader = nextHeaderValue();
            if (this.currentHeader == null) {
                // no header with a value at all: honor the documented
                // contract and mark the iteration as finished
                this.currentToken = null;
                return -1;
            }
            from = 0;
        } else {
            // called after a token, make sure there is a separator
            from = findTokenSeparator(from);
        }

        final int start = findTokenStart(from);
        if (start < 0) {
            this.currentToken = null;
            return -1; // nothing found
        }

        final int end = findTokenEnd(start);
        this.currentToken = createToken(this.currentHeader, start, end);
        return end;
    }

    /**
     * Obtains the value of the next header that has one. Headers whose value
     * is <code>null</code> are skipped, so that <code>null</code> in
     * {@link #currentHeader} unambiguously means that {@link #headerIt} is
     * exhausted.
     * 
     * @return the next non-<code>null</code> header value, or
     *         <code>null</code> if there are no more headers
     */
    private String nextHeaderValue() {
        while (this.headerIt.hasNext()) {
            final String value = this.headerIt.nextHeader().getValue();
            if (value != null) {
                return value;
            }
        }
        return null;
    }

    /**
     * Creates a new token to be returned. Called from {@link #findNext
     * findNext} after the token is identified. The default implementation
     * simply calls {@link java.lang.String#substring String.substring}. <br/>
     * If header values are significantly longer than tokens, and some tokens
     * are permanently referenced by the application, there can be problems with
     * garbage collection. A substring will hold a reference to the full
     * characters of the original string and therefore occupies more memory than
     * might be expected. To avoid this, override this method and create a new
     * string instead of a substring.
     * 
     * @param value
     *            the full header value from which to create a token
     * @param start
     *            the index of the first token character
     * @param end
     *            the index after the last token character
     * 
     * @return a string representing the token identified by the arguments
     */
    protected String createToken(String value, int start, int end) {
        return value.substring(start, end);
    }

    /**
     * Determines the starting position of the next token. This method will
     * iterate over headers if necessary. When it moves on to another header,
     * {@link #currentHeader} is updated; when all headers are exhausted,
     * {@link #currentHeader} is set to <code>null</code>.
     * 
     * @param from
     *            the position in the current header at which to start the
     *            search
     * 
     * @return the position of the token start in the current header, negative
     *         if no token start could be found
     * 
     * @throws IllegalArgumentException
     *             if <code>from</code> is negative
     * @throws ParseException
     *             if a character is found that is neither whitespace, nor a
     *             token separator, nor a valid token character
     */
    protected int findTokenStart(int from) {
        if (from < 0) {
            throw new IllegalArgumentException(
                    "Search position must not be negative: " + from);
        }

        boolean found = false;
        while (!found && (this.currentHeader != null)) {

            final int to = this.currentHeader.length();
            while (!found && (from < to)) {

                final char ch = this.currentHeader.charAt(from);
                if (isTokenSeparator(ch) || isWhitespace(ch)) {
                    // whitespace and token separators are skipped
                    from++;
                } else if (isTokenChar(ch)) {
                    // found the start of a token
                    found = true;
                } else {
                    throw new ParseException(
                            "Invalid character before token (pos " + from
                                    + "): " + this.currentHeader);
                }
            }
            if (!found) {
                // current header is used up, continue in the next one;
                // a null result means there are no more headers
                this.currentHeader = nextHeaderValue();
                from = 0;
            }
        } // while headers

        return found ? from : -1;
    }

    /**
     * Determines the position of the next token separator. Because of
     * multi-header joining rules, the end of a header value is a token
     * separator. This method does therefore not need to iterate over headers.
     * 
     * @param from
     *            the position in the current header at which to start the
     *            search
     * 
     * @return the position of a token separator in the current header, or at
     *         the end
     * 
     * @throws IllegalArgumentException
     *             if <code>from</code> is negative
     * @throws ParseException
     *             if a new token is found before a token separator. RFC 2616,
     *             section 2.1 explicitly requires a comma between tokens for
     *             <tt>#</tt>. Also thrown if a character is found that is
     *             neither whitespace, nor a token separator, nor a valid
     *             token character.
     */
    protected int findTokenSeparator(int from) {
        if (from < 0) {
            throw new IllegalArgumentException(
                    "Search position must not be negative: " + from);
        }

        boolean found = false;
        final int to = this.currentHeader.length();
        while (!found && (from < to)) {
            final char ch = this.currentHeader.charAt(from);
            if (isTokenSeparator(ch)) {
                found = true;
            } else if (isWhitespace(ch)) {
                // whitespace between a token and its separator is skipped
                from++;
            } else if (isTokenChar(ch)) {
                throw new ParseException("Tokens without separator (pos "
                        + from + "): " + this.currentHeader);
            } else {
                throw new ParseException("Invalid character after token (pos "
                        + from + "): " + this.currentHeader);
            }
        }

        return from;
    }

    /**
     * Determines the ending position of the current token. This method will not
     * leave the current header value, since the end of the header value is a
     * token boundary.
     * 
     * @param from
     *            the position of the first character of the token
     * 
     * @return the position after the last character of the token. The behavior
     *         is undefined if <code>from</code> does not point to a token
     *         character in the current header value.
     * 
     * @throws IllegalArgumentException
     *             if <code>from</code> is negative
     */
    protected int findTokenEnd(int from) {
        if (from < 0) {
            throw new IllegalArgumentException(
                    "Token start position must not be negative: " + from);
        }

        final int to = this.currentHeader.length();
        // the character at 'from' is assumed to be a token character,
        // so the scan starts with the one after it
        int end = from + 1;
        while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
            end++;
        }

        return end;
    }

    /**
     * Checks whether a character is a token separator. RFC 2616, section 2.1
     * defines comma as the separator for <tt>#token</tt> sequences. The end of
     * a header value will also separate tokens, but that is not a character
     * check.
     * 
     * @param ch
     *            the character to check
     * 
     * @return <code>true</code> if the character is a token separator,
     *         <code>false</code> otherwise
     */
    protected boolean isTokenSeparator(char ch) {
        return (ch == ',');
    }

    /**
     * Checks whether a character is a whitespace character. RFC 2616, section
     * 2.2 defines space and horizontal tab as whitespace. The optional
     * preceding line break is irrelevant, since header continuation is handled
     * transparently when parsing messages.
     * 
     * @param ch
     *            the character to check
     * 
     * @return <code>true</code> if the character is whitespace,
     *         <code>false</code> otherwise
     */
    protected boolean isWhitespace(char ch) {

        // we do not use Character.isWhitespace(ch) here, since that allows
        // many control characters which are not whitespace as per RFC 2616
        return ((ch == '\t') || Character.isSpaceChar(ch));
    }

    /**
     * Checks whether a character is a valid token character. Whitespace,
     * control characters, and HTTP separators are not valid token characters.
     * The HTTP specification (RFC 2616, section 2.2) defines tokens only for
     * the US-ASCII character set, this method extends the definition to other
     * character sets.
     * 
     * @param ch
     *            the character to check
     * 
     * @return <code>true</code> if the character is a valid token start,
     *         <code>false</code> otherwise
     */
    protected boolean isTokenChar(char ch) {

        // common sense extension of ALPHA + DIGIT
        if (Character.isLetterOrDigit(ch)) {
            return true;
        }

        // common sense extension of CTL
        if (Character.isISOControl(ch)) {
            return false;
        }

        // no common sense extension for this
        if (isHttpSeparator(ch)) {
            return false;
        }

        // RFC 2616, section 2.2 defines a token character as
        // "any CHAR except CTLs or separators". The controls
        // and separators are included in the checks above.
        // This will yield unexpected results for Unicode format characters.
        // If that is a problem, overwrite isHttpSeparator(char) to filter
        // out the false positives.
        return true;
    }

    /**
     * Checks whether a character is an HTTP separator. The implementation in
     * this class checks only for the HTTP separators defined in RFC 2616,
     * section 2.2. If you need to detect other separators beyond the US-ASCII
     * character set, override this method.
     * 
     * @param ch
     *            the character to check
     * 
     * @return <code>true</code> if the character is an HTTP separator
     */
    protected boolean isHttpSeparator(char ch) {
        return (HTTP_SEPARATORS.indexOf(ch) >= 0);
    }

} // class BasicTokenIterator

